# load packages, installing if missing
if (!require(librarian)){
  install.packages("librarian")
  library(librarian)
}
librarian::shelf(
  tidyverse, dismo, DT, here, htmltools, leaflet, mapview, raster, rgbif, 
  rgdal, rJava, sdmpredictors, sf, spocc, geojsonio, GGally, mgcv, maptools,
  caret,       # m: modeling framework
  pdp,         # X: partial dependence plots
  ranger,      # m: random forest modeling
  rpart,       # m: recursive partition modeling
  rpart.plot,  # m: recursive partition plotting
  rsample,     # d: split train/test data
  skimr,       # d: skim summarize data table
  vip,         # X: variable importance
  dismo, # species distribution modeling: maxent(), predict(), evaluate(), 
  usdm  # uncertainty analysis for species distribution models: vifcor()
  )
select <- dplyr::select # overwrite raster::select

options(readr.show_col_type = FALSE, scipen = 999)

# set random seed for reproducibility
set.seed(42)

# graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F, recursive = TRUE)

# file paths
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
dir_env <- file.path(dir_data, "env")
obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
mdl_maxv_rds  <- file.path(dir_data, "mdl_maxent_vif.rds")

# logical, redo full calculations within if statements or not
redo <- FALSE

1 Explore

1.1 Species observations

Species: Red-tailed Hawk (Buteo jamaicensis) Source: Audubon Field Guide

Total observations after creating bounding box around habitat area and removing duplicate geometries.

if (!file.exists(obs_geo) | redo){
  # get species occurrence data from GBIF with coordinates
  (res <- spocc::occ(
    query = 'Buteo jamaicensis',
    from = 'gbif',
    has_coords = T,
    limit = 10000))
  
  # extract data frame from result
  df <- res$gbif$data[[1]] 
  readr::write_csv(df, obs_csv)

  # convert to points of observation from lon/lat columns in data frame
  obs <- df %>% 
    # limit observations to bounding box around north and central america
    filter(between(longitude, -167.593385, -51.171266),
           between(latitude, 5.645215, 71.374349)) %>% 
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>% 
    select(prov, key) %>%  # save space (joinable from obs_csv)
    distinct(geometry, .keep_all = TRUE) # remove duplicate geometries
  
  sf::write_sf(obs, obs_geo, delete_dsn = T)
}

obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 9376

1.1.1 Map of species observation locations

# show points on map
mapview::mapview(obs, map.types = "Esri.WorldStreetMap")

1.2 Get environmental data

Explore environmental data options

# set a default data directory
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
env_datasets %>% 
  select(dataset_code, description, citation) %>% 
  DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)

Choose environmental data layers

# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", # altitude
                    "WC_bio1", # annual mean temperature
                    "WC_bio2", # mean diurnal temperature range
                    "ER_tri", # terrain roughness
                    "WC_bio12") # annual precipitation

# get layers
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc = 2)

Create species range

if (!file.exists(obs_hull_geo) | redo){
  # make convex hull around points of observation
  obs_hull <- sf::st_convex_hull(st_union(obs))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)

# show points on map
mapview(list(obs, obs_hull))

1.2.1 Raster plot of environmental rasters clipped to species range

if (!file.exists(env_stack_grd) | redo){
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite = T)  
}
env_stack <- stack(env_stack_grd)

# show map
plot(env_stack, nc = 2)

1.2.2 Map of pseudo-absence points

if (!file.exists(absence_geo) | redo){
  # get raster count of observations
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
  # create mask for 
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
  
  # generate random points inside mask
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn = T)
}
absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
if (!file.exists(pts_env_csv) | redo){

  # combine presence and absence into single set of labeled points 
  pts <- rbind(
    obs %>% 
      mutate(present = 1) %>% 
      select(present, key),
    absence %>% mutate(present = 0, 
                       key = NA)) %>% 
    mutate(ID = 1:n()) %>% 
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn = TRUE)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df = TRUE) %>% 
    tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(pts %>% select(ID, present), 
              by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(lon = st_coordinates(geometry)[,1],
           lat = st_coordinates(geometry)[,2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)

pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(rownames = FALSE,
                options = list(dom = "t",
                               pageLength = 20))

1.2.3 Environmental data term plots

pts_env %>% 
  select(-ID) %>% 
  mutate(present = factor(present)) %>% 
  pivot_longer(-present) %>% 
  ggplot() +
  geom_density(aes(x = value, fill = present)) + 
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_bw() + 
  facet_wrap(~name, scales = "free") +
  theme(legend.position = c(1, 0),
        legend.justification = c(1, 0))

1.3 Questions

Question 1: There were a total of 7,004,451 observations in GBIF for the Red-tailed hawk (Buteo jamaicensis) as of 2022-01-24.
Question 2: There was one point from the original dataset that was in the ocean near Europe. To fix this issues, I created a bounding box around the parts of North and Central America and used this bbox to exclude any locations outside of the Red-tailed Hawks habitat.
Question 3: The environmental layers that I choose include: altitude, annual mean temperature, mean diurnal temperature range, terrain roughness, and annual precipitation. In the literature I found, other studies used: maximum and minimum temperatures, precipitation, solar radiation, wind speed, and cloud cover as abiotic factors. I felt that the first two variables from the literature were covered by annual mean temperature, mean diurnal temperature range, and annual precipitation. The other variables were not present in the available layers, so I did not include them.

2 Logistic Regression

2.1 Plot of ggpairs

GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present), alpha = 0.5))

Set up data

# setup model data
model_data <- pts_env %>% 
  select(-ID) %>%  # remove terms we don't want to model
  tidyr::drop_na() # drop rows with NA values
nrow(model_data)
## [1] 18682

2.2 Linear Model

# fit a linear model
mdl_lm <- lm(present ~ ., data = model_data) # . means all other columns (X)
summary(mdl_lm)
## 
## Call:
## lm(formula = present ~ ., data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.13102 -0.41811  0.05709  0.41161  1.22059 
## 
## Coefficients:
##                Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)  0.46989444  0.07840354   5.993       0.000000002094 ***
## WC_alt      -0.00010576  0.00001235  -8.564 < 0.0000000000000002 ***
## WC_bio1      0.03177874  0.00218006  14.577 < 0.0000000000000002 ***
## WC_bio2     -0.04752572  0.00216064 -21.996 < 0.0000000000000002 ***
## ER_tri      -0.00029041  0.00013004  -2.233               0.0255 *  
## WC_bio12    -0.00027986  0.00001037 -26.980 < 0.0000000000000002 ***
## lon         -0.00149905  0.00030980  -4.839       0.000001316915 ***
## lat          0.01003139  0.00152964   6.558       0.000000000056 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.452 on 18674 degrees of freedom
## Multiple R-squared:  0.1832, Adjusted R-squared:  0.1828 
## F-statistic: 598.1 on 7 and 18674 DF,  p-value: < 0.00000000000000022
y_predict_lm <- predict(mdl_lm, model_data, type = "response")
y_true <- pts_env$present

range(y_predict_lm)
## [1] -0.5917617  1.1310212
range(y_true)
## [1] 0 1

2.3 Generalized Linear Model

# fit a generalized linear model with a binomial logit link function
mdl_glm <- glm(present ~ ., 
               family = binomial(link = "logit"), 
               data = model_data)
summary(mdl_glm)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = model_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5181  -1.0147   0.3938   0.9893   2.7853  
## 
## Coefficients:
##                Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept) -0.32497197  0.40542651  -0.802             0.422810    
## WC_alt      -0.00056103  0.00006311  -8.889 < 0.0000000000000002 ***
## WC_bio1      0.16329620  0.01122489  14.548 < 0.0000000000000002 ***
## WC_bio2     -0.23208339  0.01141338 -20.334 < 0.0000000000000002 ***
## ER_tri      -0.00136951  0.00066749  -2.052             0.040195 *  
## WC_bio12    -0.00143635  0.00005592 -25.685 < 0.0000000000000002 ***
## lon         -0.00562434  0.00155676  -3.613             0.000303 ***
## lat          0.05724220  0.00797420   7.178    0.000000000000705 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25899  on 18681  degrees of freedom
## Residual deviance: 22069  on 18674  degrees of freedom
## AIC: 22085
## 
## Number of Fisher Scoring iterations: 4
y_predict_glm <- predict(mdl_glm, model_data, type = "response")

range(y_predict_glm)
## [1] 0.002979517 0.958016253

2.3.1 GLM Term Plots

termplot(mdl_glm, partial.resid = TRUE, se = TRUE, main = F)

2.4 Generalized Additive Model

# fit a generalized additive model with smooth predictors
mdl_gam <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio2) + s(ER_tri) + s(WC_bio12) + s(lon) + s(lat), 
  family = binomial, data = model_data)

summary(mdl_gam)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(WC_bio12) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value   Pr(>|z|)    
## (Intercept) -0.38608    0.08701  -4.437 0.00000911 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df Chi.sq             p-value    
## s(WC_alt)   8.836  8.987  264.9 <0.0000000000000002 ***
## s(WC_bio1)  8.963  8.999 1255.3 <0.0000000000000002 ***
## s(WC_bio2)  7.613  8.306  658.3 <0.0000000000000002 ***
## s(ER_tri)   7.698  8.134  130.4 <0.0000000000000002 ***
## s(WC_bio12) 6.348  7.041  650.7 <0.0000000000000002 ***
## s(lon)      8.784  8.974  596.4 <0.0000000000000002 ***
## s(lat)      8.845  8.993  302.1 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.519   Deviance explained = 45.7%
## UBRE = -0.24079  Scale est. = 1         n = 18682

2.4.1 GAM Term Plots

plot(mdl_gam, scale = 0)

2.5 Maxent (Maximum Entropy)

mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds") # create new file path

# show version of maxent
if (!interactive())
  maxent()
## This is MaxEnt version 3.4.3
# plot environmental rasters
plot(env_stack, nc = 2)

2.5.1 Maxent variable contribution plot

# get presence-only observation points (maxent extracts raster values for you)
obs_sp <- sf::as_Spatial(obs) # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds) | redo){
  mdl <- maxent(env_stack, obs_sp)
  readr::write_rds(mdl, mdl_maxent_rds)
}
mdl_maxent <- read_rds(mdl_maxent_rds)

# plot variable contributions per predictor
plot(mdl_maxent)

2.5.2 Maxent term plots

# plot term plots
response(mdl_maxent)

2.5.3 Maxent prediction

y_predict_maxent <- predict(env_stack, mdl_maxent)

plot(y_predict_maxent, main = 'Maxent, raw prediction')
data(wrld_simpl, package = "maptools")
plot(wrld_simpl, add = TRUE, border='dark grey')

2.6 Questions

Question 4: There are two environment variables that seem to contribute most towards presence. For altitude (WC_atl), below about 250 ft of altitude there is a higher chance of presence with a high confidence range. Where there is a diurnal temperature range (WC_bio2) between about 7 and 12 degree there is a higher chance of species presence. Additionally, between about -120 and -100 longitudinal degrees there is a higher chance of species presence.
Question 5: There are four environment variables that seem to contribute most towards presence for the maxent model. The first two, altitude (WC_alt) and diurnal temperature range (WC_bio2) were similar to the GAM results. For altitude, below about 500 ft of altitude there is a high chance of presence. Where there is a diurnal temperature range between about 5 and 15 degree there is a high chance of species presence. The other two variables (mean annual temperature and annual precipitation) had values that seems to predict species presence that did not show up in the GAM results. When the mean annual temperature (WC_bio1) is between 10 and 20 degrees there is a high chance of species presence. And when the annual precipitation (WC_bio12) is below 1,000 there is a high chance of species presence.

3 Decision Trees

Set up data

decision_tree_data <- pts_env %>% 
  select(-ID) %>%                         # not used as a predictor x
  mutate(present = factor(present)) %>%   # categorical response
  na.omit()                               # drop rows with NA

skim(decision_tree_data)
Data summary
Name decision_tree_data
Number of rows 18682
Number of columns 8
_______________________
Column type frequency:
factor 1
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
present 0 1 FALSE 2 1: 9344, 0: 9338

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WC_alt 0 1 555.52 640.74 -74.00 99.00 275.00 792.00 3813.00 ▇▂▁▁▁
WC_bio1 0 1 12.30 6.62 -5.70 7.70 12.30 17.30 29.00 ▂▅▇▇▂
WC_bio2 0 1 12.72 2.50 4.10 11.10 12.40 14.50 20.90 ▁▃▇▃▁
ER_tri 0 1 25.77 34.44 0.00 5.28 11.42 32.08 272.13 ▇▁▁▁▁
WC_bio12 0 1 792.04 458.62 49.00 435.00 755.00 1066.00 4827.00 ▇▃▁▁▁
lon 0 1 -100.60 16.63 -135.29 -117.06 -100.62 -86.80 -60.04 ▅▇▇▇▂
lat 0 1 37.97 9.00 9.21 33.10 38.53 43.78 60.46 ▁▂▇▇▂

3.1 Split data into training and testing

# create training set with 80% of full data
d_split  <- rsample::initial_split(decision_tree_data, 
                                   prop = 0.8, 
                                   strata = "present")
d_train  <- rsample::training(d_split)
# show number of rows present is 0 vs 1 (all data)
table(decision_tree_data$present)
## 
##    0    1 
## 9338 9344
# show number of rows present is 0 vs 1 (training data)
table(d_train$present)
## 
##    0    1 
## 7470 7475

3.2 Rpart models

3.2.1 Partition, depth=1

# run decision stump model
mdl_dt1 <- rpart(present ~ ., 
                 data = d_train,
                 control = list(cp = 0, 
                                minbucket = 5, 
                                maxdepth = 1))
mdl_dt1
## n= 14945 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 14945 7470 1 (0.49983272 0.50016728)  
##   2) WC_bio1< 6.35 2756  181 0 (0.93432511 0.06567489) *
##   3) WC_bio1>=6.35 12189 4895 1 (0.40159160 0.59840840) *
# plot tree 
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl_dt1)

3.2.2 Partition, depth=default

# decision tree with defaults
mdl_dt2 <- rpart(present ~ ., data = d_train)
mdl_dt2
## n= 14945 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 14945 7470 1 (0.49983272 0.50016728)  
##     2) WC_bio1< 6.35 2756  181 0 (0.93432511 0.06567489) *
##     3) WC_bio1>=6.35 12189 4895 1 (0.40159160 0.59840840)  
##       6) lon>=-116.1268 8654 4206 0 (0.51398197 0.48601803)  
##        12) lon< -80.37332 6710 2693 0 (0.59865872 0.40134128)  
##          24) WC_bio12>=1342.5 825   99 0 (0.88000000 0.12000000) *
##          25) WC_bio12< 1342.5 5885 2594 0 (0.55921835 0.44078165)  
##            50) WC_bio2>=12.85 3423 1123 0 (0.67192521 0.32807479) *
##            51) WC_bio2< 12.85 2462  991 1 (0.40251828 0.59748172)  
##             102) lon>=-96.47691 1777  877 1 (0.49352842 0.50647158)  
##               204) lat< 28.29523 192   29 0 (0.84895833 0.15104167) *
##               205) lat>=28.29523 1585  714 1 (0.45047319 0.54952681)  
##                 410) lon< -89.61047 531  194 0 (0.63465160 0.36534840) *
##                 411) lon>=-89.61047 1054  377 1 (0.35768501 0.64231499) *
##             103) lon< -96.47691 685  114 1 (0.16642336 0.83357664) *
##        13) lon>=-80.37332 1944  431 1 (0.22170782 0.77829218)  
##          26) lat< 38.57665 347  108 0 (0.68876081 0.31123919) *
##          27) lat>=38.57665 1597  192 1 (0.12022542 0.87977458) *
##       7) lon< -116.1268 3535  447 1 (0.12644979 0.87355021) *

3.2.3 Complexity parameter plots

rpart.plot(mdl_dt2)

# plot complexity parameter
plotcp(mdl_dt2)

# rpart cross validation results
mdl_dt2$cptable
##           CP nsplit rel error    xerror        xstd
## 1 0.32048193      0 1.0000000 1.0279786 0.008179589
## 2 0.08862115      1 0.6795181 0.6815261 0.007756026
## 3 0.03212851      3 0.5022758 0.5062918 0.007115127
## 4 0.01753681      5 0.4380187 0.4420348 0.006789729
## 5 0.01236055      6 0.4204819 0.4242303 0.006689466
## 6 0.01000000      9 0.3834003 0.4000000 0.006545350
# caret cross validation results
mdl_caret <- train(present ~ .,
                   data = d_train,
                   method = "rpart",
                   trControl  = trainControl(method = "cv", number = 10),
                   tuneLength = 20)

ggplot(mdl_caret)

3.2.4 Variable importance plot

vip(mdl_caret, num_features = 40, bar = FALSE)

Partial dependence plots

# Construct partial dependence plots
partial_dependency1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
partial_dependency2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
partial_dependency3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>%
  plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
              colorkey = TRUE, screen = list(z = -20, x = -60))

# Display plots side by side
gridExtra::grid.arrange(partial_dependency1, partial_dependency2, partial_dependency3, ncol = 3)

3.3 Random Forests

# number of features
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)

# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.2908209

3.3.1 Variable importance plot

# re-run model with impurity-based variable importance
mdl_impurity <- ranger(present ~ ., 
                       data = d_train,
                       importance = "impurity")

# re-run model with permutation-based variable importance
mdl_permutation <- ranger(present ~ ., 
                          data = d_train,
                          importance = "permutation")

p_impurity <- vip::vip(mdl_impurity, bar = FALSE)
p_permutation <- vip::vip(mdl_permutation, bar = FALSE)

gridExtra::grid.arrange(p_impurity, p_permutation, nrow = 1)

3.4 Questions

Question 6: It is recommended that we use a tree of size 5.
Question 7: The top three most important variables for my model are: latitude, annual mean temperature (WC_bio1), and altitude (WC_alt).
Question 8: In the permutations importance plot for the RandomForest model the most important variables are longitude, latitude, and mean annual temperature (WC_bio1). This is different from the Rpart model, where the important variables were latitude, mean annual temperature (WC_bio1), and altitude (WC_alt).

4 Evaluate Models

Split observations into training and testing

pts <- read_sf(pts_geo)

# create training set with 80% of full data
pts_split  <- rsample::initial_split(pts, 
                                     prop = 0.8, 
                                     strata = "present")
pts_train  <- rsample::training(pts_split)
pts_test   <- rsample::testing(pts_split)

pts_train_p <- pts_train %>% 
  filter(present == 1) %>% 
  as_Spatial()

pts_train_a <- pts_train %>% 
  filter(present == 0) %>% 
  as_Spatial()

4.1 Calibrate variable selection

4.1.1 Pairs plot of full environmental stack

# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)

4.1.2 Calculate VIF per variable

# calculate variance inflation factor (VIF) per predictor, a metric of multicollinearity between variables
vif(env_stack)
##   Variables      VIF
## 1    WC_alt 2.972943
## 2   WC_bio1 1.732601
## 3   WC_bio2 3.177039
## 4    ER_tri 1.874529
## 5  WC_bio12 2.282351

4.1.3 Variables after VIF collinearity removal

# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th = 0.7) 
v
## No variable from the 5 input variables has collinearity problem. 
## 
## The linear correlation coefficients ranges between: 
## min correlation ( ER_tri ~ WC_bio1 ):  -0.1380049 
## max correlation ( WC_bio12 ~ WC_bio2 ):  -0.589015 
## 
## ---------- VIFs of the remained variables -------- 
##   Variables      VIF
## 1    WC_alt 2.729156
## 2   WC_bio1 1.707131
## 3   WC_bio2 3.111583
## 4    ER_tri 1.836237
## 5  WC_bio12 2.201158

4.1.4 Pair plots after VIF collinearity in environmental stack removed

# reduce enviromental raster stack by 
env_stack_v <- usdm::exclude(env_stack, v)

# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)

4.1.5 Variable contribution plot

# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds) | redo){
  mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
  readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
mdl_maxv <- read_rds(mdl_maxv_rds, sf::as_Spatial(pts_train))

# plot variable contributions per predictor
plot(mdl_maxv)

4.1.6 Maxent term plots

# plot term plots
response(mdl_maxv)

4.1.7 Maxent term predictions map

# predict
y_maxv <- predict(env_stack, mdl_maxv)

plot(y_maxv, main = 'Maxent, raw prediction')
data(wrld_simpl, package = "maptools")
plot(wrld_simpl, add = TRUE, border = 'dark grey')

4.2 Evaluate model performance

4.2.1 ROC threshold value maximizing specificity and sensitivity

Reciever Operater Characteristic (ROC) Curve

pts_test_p <- pts_test %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_test_a <- pts_test %>% 
  filter(present == 0) %>% 
  as_Spatial()

y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)

e <- dismo::evaluate(
  p     = pts_test_p,
  a     = pts_test_a, 
  model = mdl_maxv,
  x     = env_stack)
e
## class          : ModelEvaluation 
## n presences    : 1873 
## n absences     : 1868 
## AUC            : 0.8747603 
## cor            : 0.636749 
## max TPR+TNR at : 0.6642459
plot(e, 'ROC')

thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6642459

4.2.2 Confusion matrix

p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)

# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)

matrix(
  c(tpr, fnr,
    fpr, tnr), 
  nrow = 2, dimnames = list(
    c("present_obs", "absent_obs"),
    c("present_pred", "absent_pred")))
##             present_pred absent_pred
## present_obs      0.72504   0.1381156
## absent_obs       0.27496   0.8618844

4.2.3 AUC plot

Area Under the Curve (AUC)

plot(e, 'ROC')

# add point to ROC plot
points(fpr, tpr, pch = 23, bg = "blue") ## BREAKS HERE

4.3 Map of binary habitat

plot(y_maxv > thr)

4.4 Questions

Question 9: No variables were removed due to multicollinearity as the highest correllation between variables is 0.6 for mean diurnal temperature range (WC_bio2) and annual precipiation (WC_bio12). The variables ranked in importance from most to least are:
- Annual mean temperature (WC_bio1)
- Altitude (WC_alt)
- Annual precipitation (WC_bio12)
- Terrain roughness (ER_tri)
- Mean Diurnal temperature range (WC_bio2)